09 collections高级容器

Python内置的dict、list、set、tuple已经很强了，但有些场景用起来还是不够方便——比如统计词频、给字典设置默认值、在列表两端快速插入删除。collections模块就是为这些场景准备的，它提供了一批"增强版"的容器类型。

一、Counter：计数器

统计元素出现的次数，用Counter一行搞定。

1.1 基本用法

python

from collections import Counter

# 从列表创建
words = ["apple", "banana", "apple", "cherry", "banana", "apple"]
counter = Counter(words)
print(counter)
# Counter({'apple': 3, 'banana': 2, 'cherry': 1})

# 从字符串创建（统计字符）
counter = Counter("abracadabra")
print(counter)
# Counter({'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1})

# 从关键字参数创建
counter = Counter(a=3, b=2, c=1)

1.2 获取计数

python

from collections import Counter

counter = Counter(["apple", "apple", "banana", "cherry"])

# 像字典一样访问
counter["apple"]    # 2
counter["grape"]    # 0（不存在时返回0，不是KeyError）

# 获取最常见的元素
counter.most_common(2)  # [('apple', 2), ('banana', 1)]

# 总数
counter.total()  # 4 (3.10+)

1.3 更新计数

python

from collections import Counter

counter = Counter(["apple", "banana"])

# 增加计数
counter.update(["apple", "cherry"])
print(counter)  # Counter({'apple': 2, 'banana': 1, 'cherry': 1})

# 减少计数
counter.subtract(["apple", "apple"])
print(counter)  # Counter({'banana': 1, 'cherry': 1, 'apple': 0})（按计数降序显示；计数减到0的键不会被自动删除）

1.4 集合运算

python

from collections import Counter

c1 = Counter(a=3, b=1)
c2 = Counter(a=1, b=2)

# 加法：合并计数
c1 + c2  # Counter({'a': 4, 'b': 3})

# 减法：只保留正数
c1 - c2  # Counter({'a': 2})

# 交集：取最小值
c1 & c2  # Counter({'a': 1, 'b': 1})

# 并集：取最大值
c1 | c2  # Counter({'a': 3, 'b': 2})

1.5 实用场景

python

from collections import Counter

# 统计词频
text = "the cat sat on the mat the cat"
word_counts = Counter(text.split())
print(word_counts.most_common(3))
# [('the', 3), ('cat', 2), ('sat', 1)]

# 统计日志级别
log_levels = ["INFO", "ERROR", "INFO", "WARN", "ERROR", "INFO"]
level_counts = Counter(log_levels)
print(level_counts)
# Counter({'INFO': 3, 'ERROR': 2, 'WARN': 1})

二、defaultdict：带默认值的字典

普通字典访问不存在的键会报KeyError，defaultdict会自动创建默认值。

2.1 基本用法

python

from collections import defaultdict

# 默认值为list
dd = defaultdict(list)
dd["fruits"].append("apple")
dd["fruits"].append("banana")
dd["colors"].append("red")
print(dd)
# defaultdict(<class 'list'>, {'fruits': ['apple', 'banana'], 'colors': ['red']})

# 默认值为int
dd = defaultdict(int)
dd["apple"] += 1
dd["apple"] += 1
dd["banana"] += 1
print(dd)
# defaultdict(<class 'int'>, {'apple': 2, 'banana': 1})

# 默认值为set
dd = defaultdict(set)
dd["tags"].add("python")
dd["tags"].add("ai")

2.2 分组数据

python

from collections import defaultdict

students = [
    {"name": "大志", "grade": "A"},
    {"name": "小明", "grade": "B"},
    {"name": "小红", "grade": "A"},
    {"name": "小李", "grade": "B"},
]

# 按成绩分组
by_grade = defaultdict(list)
for s in students:
    by_grade[s["grade"]].append(s["name"])

print(dict(by_grade))
# {'A': ['大志', '小红'], 'B': ['小明', '小李']}

2.3 自定义默认工厂

python

from collections import defaultdict

# 默认值为lambda返回的值
dd = defaultdict(lambda: "未知")
dd["name"] = "大志"
print(dd["name"])    # "大志"
print(dd["age"])     # "未知"

# 默认值为字典
dd = defaultdict(dict)
dd["user"]["name"] = "大志"
dd["user"]["age"] = 28

三、deque：双端队列

列表在尾部插入删除是O(1)，但在头部或中间插入删除是O(n)（后面的元素都要整体移动）。deque在两端的插入删除都是O(1)。

3.1 基本操作

python

from collections import deque

# 创建deque
d = deque([1, 2, 3])

# 两端添加
d.append(4)       # 右端添加: [1, 2, 3, 4]
d.appendleft(0)   # 左端添加: [0, 1, 2, 3, 4]

# 两端弹出
d.pop()           # 右端弹出: [0, 1, 2, 3]
d.popleft()       # 左端弹出: [1, 2, 3]

# 扩展
d.extend([4, 5])      # 右端扩展: [1, 2, 3, 4, 5]
d.extendleft([0, -1]) # 左端扩展（注意顺序）: [-1, 0, 1, 2, 3, 4, 5]

3.2 旋转

python

from collections import deque

d = deque([1, 2, 3, 4, 5])

# 右旋2步：末尾的2个元素移到开头
d.rotate(2)
print(d)  # deque([4, 5, 1, 2, 3])

# 左旋2步：开头的2个元素移到末尾
d.rotate(-2)
print(d)  # deque([1, 2, 3, 4, 5])

3.3 固定长度

python

from collections import deque

# maxlen：限制最大长度，超出时自动丢弃另一端的元素
d = deque(maxlen=3)
d.append(1)  # [1]
d.append(2)  # [1, 2]
d.append(3)  # [1, 2, 3]
d.append(4)  # [2, 3, 4]（1被丢弃）

3.4 实用场景

python

from collections import deque

# 最近N条记录
recent_logs = deque(maxlen=100)
for log in all_logs:
    recent_logs.append(log)

# 滑动窗口
def sliding_window(iterable, n):
    """Yield each run of ``n`` consecutive items from ``iterable`` as a tuple.

    A deque bounded to ``n`` entries holds the current window; appending to a
    full bounded deque drops its oldest entry, so the window advances by one
    item per step. Nothing is yielded until the window holds ``n`` items, so
    an input shorter than ``n`` produces no windows.
    """
    buffer = deque(maxlen=n)
    for element in iterable:
        buffer.append(element)
        if len(buffer) != n:
            # Still filling the first window.
            continue
        yield tuple(buffer)

list(sliding_window([1, 2, 3, 4, 5], 3))
# [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

# BFS广度优先搜索
from collections import deque

def bfs(graph, start):
    """Print every node reachable from ``start`` in breadth-first order.

    ``graph`` is indexed by node (``graph[node]``) to obtain that node's
    neighbors. A node is marked as seen when it is enqueued rather than when
    it is dequeued, so each node is queued and printed at most once.
    """
    seen = {start}
    pending = deque((start,))

    while pending:
        current = pending.popleft()
        print(current)
        for adjacent in graph[current]:
            if adjacent in seen:
                continue
            seen.add(adjacent)
            pending.append(adjacent)

四、namedtuple：命名元组

普通元组用索引访问元素，namedtuple可以用名字访问，更清晰。

4.1 创建namedtuple

python

from collections import namedtuple

# 定义namedtuple类型
Point = namedtuple('Point', ['x', 'y'])

# 创建实例
p = Point(11, 22)
print(p.x)    # 11
print(p.y)    # 22
print(p[0])   # 11（也可以用索引）

4.2 定义方式

python

from collections import namedtuple

# 方式1：列表
Point = namedtuple('Point', ['x', 'y'])

# 方式2：空格分隔的字符串
Point = namedtuple('Point', 'x y')

# 方式3：逗号分隔的字符串
Point = namedtuple('Point', 'x, y')

4.3 使用方法

python

from collections import namedtuple

Point = namedtuple('Point', ['x', 'y'])
p = Point(11, 22)

# 转为字典
p._asdict()  # {'x': 11, 'y': 22}

# 替换字段（返回新实例）
p2 = p._replace(x=33)
print(p2)  # Point(x=33, y=22)

# 获取字段名
Point._fields  # ('x', 'y')

# 从可迭代对象创建
Point._make([11, 22])  # Point(x=11, y=22)

4.4 实用场景

python

from collections import namedtuple

# 表示数据库记录
User = namedtuple('User', ['id', 'name', 'email'])
user = User(1, "大志", "test@example.com")

# 表示配置
Config = namedtuple('Config', ['host', 'port', 'debug'])
config = Config(host="localhost", port=8080, debug=True)

# 作为轻量级类
Color = namedtuple('Color', ['red', 'green', 'blue'])
red = Color(255, 0, 0)

五、ChainMap：链式映射

把多个字典组合成一个视图，查找时按顺序搜索。

5.1 基本用法

python

from collections import ChainMap

defaults = {"color": "red", "size": "large"}
user_prefs = {"color": "blue"}

# 组合两个字典
config = ChainMap(user_prefs, defaults)

# 查找：先搜索user_prefs，再搜索defaults
config["color"]  # 'blue'（user_prefs中的值）
config["size"]   # 'large'（defaults中的值）

5.2 实用场景

python

from collections import ChainMap

# 配置管理：默认配置 < 环境配置 < 用户配置
default_config = {"host": "localhost", "port": 8080, "debug": False}
env_config = {"port": 9090}
user_config = {"debug": True}

config = ChainMap(user_config, env_config, default_config)
print(config["host"])   # localhost
print(config["port"])   # 9090
print(config["debug"])  # True

# Layered settings lookup: command-line flags > APP_* environment variables > defaults
import os
import argparse

defaults = {"verbose": False, "output": "result.txt"}

# Strip the APP_ prefix and lowercase the rest so that e.g. APP_OUTPUT
# overrides the "output" default; with the prefix left on, no lookup for
# "verbose"/"output" could ever hit this layer.
# Note: values taken from the environment are always strings.
env_vars = {
    k.removeprefix("APP_").lower(): v
    for k, v in os.environ.items()
    if k.startswith("APP_")
}

# Every option defaults to None so that a flag the user did not pass can be
# dropped below; otherwise its placeholder value would sit in the first
# mapping and shadow the environment and default layers.
parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true", default=None)
parser.add_argument("--output")
cli_args = {k: v for k, v in vars(parser.parse_args()).items() if v is not None}

config = ChainMap(cli_args, env_vars, defaults)

5.3 修改行为

python

from collections import ChainMap

cm = ChainMap({}, {"a": 1, "b": 2})

# 写入/更新只影响第一个字典
cm["a"] = 10
cm["c"] = 3
print(cm)
# ChainMap({'a': 10, 'c': 3}, {'a': 1, 'b': 2})

# 删除只影响第一个字典
del cm["a"]

# 创建新的子上下文
child = cm.new_child({"x": 100})
print(child)
# ChainMap({'x': 100}, {'c': 3}, {'a': 1, 'b': 2})

# 获取父上下文（去掉第一个映射）
cm.parents

六、OrderedDict：有序字典

Python 3.7+的普通字典已经保证插入顺序了，OrderedDict的主要用途是提供顺序相关的额外方法。

6.1 与普通字典的区别

python

from collections import OrderedDict

# 顺序比较
od1 = OrderedDict(a=1, b=2)
od2 = OrderedDict(b=2, a=1)
od1 == od2  # False（顺序不同）

d1 = dict(a=1, b=2)
d2 = dict(b=2, a=1)
d1 == d2    # True（普通字典不比较顺序）

6.2 move_to_end()

python

from collections import OrderedDict

od = OrderedDict(a=1, b=2, c=3)

# 移到末尾
od.move_to_end("a")
print(od)  # OrderedDict([('b', 2), ('c', 3), ('a', 1)])

# 移到开头
od.move_to_end("c", last=False)
print(od)  # OrderedDict([('c', 3), ('b', 2), ('a', 1)])

6.3 实现LRU缓存

python

from collections import OrderedDict

class LRUCache:
    """Fixed-capacity cache that evicts the least recently used key.

    Entries live in an ``OrderedDict`` ordered from least to most recently
    used: reading or writing a key moves it to the end, and eviction removes
    the entry at the front.
    """

    def __init__(self, capacity):
        """Create an empty cache that keeps at most ``capacity`` entries."""
        self.capacity = capacity
        self.cache = OrderedDict()

    def get(self, key):
        """Return the value stored for ``key``, or -1 if it is not cached.

        A hit marks ``key`` as the most recently used entry.
        """
        if key not in self.cache:
            return -1
        self.cache.move_to_end(key)
        return self.cache[key]

    def put(self, key, value):
        """Store ``value`` under ``key`` and mark it as most recently used.

        If the cache then holds more than ``capacity`` entries, the least
        recently used entry is evicted.
        """
        # Assigning to an existing key keeps its position, so the explicit
        # move is what refreshes its recency; for a new key it is a no-op.
        self.cache[key] = value
        self.cache.move_to_end(key)
        if len(self.cache) > self.capacity:
            self.cache.popitem(last=False)

cache = LRUCache(2)
cache.put("a", 1)
cache.put("b", 2)
cache.get("a")     # 1
cache.put("c", 3)  # "b"被淘汰

七、UserDict、UserList、UserString

这些是内置类型的包装器，主要用于继承。

python

from collections import UserDict

class CaseInsensitiveDict(UserDict):
    """Dictionary whose keys are matched case-insensitively.

    Keys are normalized with ``key.lower()`` before they reach the underlying
    ``self.data`` dict, so ``d["Name"]``, ``d["name"]`` and ``d["NAME"]`` all
    address the same entry. Iteration yields the lower-cased keys. Keys must
    provide a ``lower()`` method (e.g. ``str``).
    """

    def __setitem__(self, key, value):
        super().__setitem__(key.lower(), value)

    def __getitem__(self, key):
        return super().__getitem__(key.lower())

    def __delitem__(self, key):
        # Normalize here too, otherwise ``del d["Name"]`` (and ``pop``, which
        # is built on it) would look up the raw key and raise KeyError.
        super().__delitem__(key.lower())

    def __contains__(self, key):
        # UserDict.__contains__ checks self.data directly and bypasses
        # __getitem__, so membership must normalize the key itself to agree
        # with item access.
        try:
            normalized = key.lower()
        except AttributeError:
            # A key without lower() can never have been stored.
            return False
        return normalized in self.data

d = CaseInsensitiveDict()
d["Name"] = "大志"
print(d["name"])   # 大志
print(d["NAME"])   # 大志

八、总结

collections模块的核心类：

类	用途
`Counter`	统计元素出现次数
`defaultdict`	带默认值的字典
`deque`	双端队列，两端快速操作
`namedtuple`	命名元组，用名字访问字段
`ChainMap`	组合多个字典为一个视图
`OrderedDict`	有序字典，提供顺序相关方法

使用场景速查：

场景	用什么
统计词频	`Counter`
按条件分组	`defaultdict(list)`
计数器	`defaultdict(int)` 或 `Counter`
最近N条记录	`deque(maxlen=N)`
BFS队列	`deque`
轻量级数据对象	`namedtuple`
合并配置	`ChainMap`

Counter和defaultdict用得最多，记住它们就够了。

09 collections高级容器 ​

一、Counter：计数器 ​

1.1 基本用法 ​

1.2 获取计数 ​

1.3 更新计数 ​

1.4 集合运算 ​

1.5 实用场景 ​

二、defaultdict：带默认值的字典 ​

2.1 基本用法 ​

2.2 分组数据 ​

2.3 自定义默认工厂 ​

三、deque：双端队列 ​

3.1 基本操作 ​

3.2 旋转 ​

3.3 固定长度 ​

3.4 实用场景 ​

四、namedtuple：命名元组 ​

4.1 创建namedtuple ​

4.2 定义方式 ​

4.3 使用方法 ​

4.4 实用场景 ​

五、ChainMap：链式映射 ​

5.1 基本用法 ​

5.2 实用场景 ​

5.3 修改行为 ​

六、OrderedDict：有序字典 ​

6.1 与普通字典的区别 ​

6.2 move_to_end() ​

6.3 实现LRU缓存 ​

七、UserDict、UserList、UserString ​

八、总结 ​

09 collections高级容器

一、Counter：计数器

1.1 基本用法

1.2 获取计数

1.3 更新计数

1.4 集合运算

1.5 实用场景

二、defaultdict：带默认值的字典

2.1 基本用法

2.2 分组数据

2.3 自定义默认工厂

三、deque：双端队列

3.1 基本操作

3.2 旋转

3.3 固定长度

3.4 实用场景

四、namedtuple：命名元组

4.1 创建namedtuple

4.2 定义方式

4.3 使用方法

4.4 实用场景

五、ChainMap：链式映射

5.1 基本用法

5.2 实用场景

5.3 修改行为

六、OrderedDict：有序字典

6.1 与普通字典的区别

6.2 move_to_end()

6.3 实现LRU缓存

七、UserDict、UserList、UserString

八、总结